library(ggplot2)
library(caTools)
library(plotly)
library(MASS)
library(matlib)
# Set random seed for reproducibility
set.seed(123)
auc <- 0.95 # AUC
y <- 0.0 # delta sigma
z <- 0.2 # z
p <- 1 # proportion of correlated predictors
npred <- 3 # number of predictors
n <- 10000 # Number of samples in each class
n_iter <- 400 # Number of iterations for logistic regression
auc_values <- numeric(n_iter) # Vector to store AUC values
sample_ratio <- 0.3 # Proportion of data to use in each logistic regression
calculate sigma0 and sigma1
scenario <- function(){
# set up correlations
corr0 <- matrix(0, npred, npred) # matrix: set up for cov matrix, 0 on diagonals
corr0[1:npred*p, 1:npred*p] = z # class 0
diag(corr0) = 0
corr1 <- matrix(0, npred, npred) # matrix: set up for cov matrix, 0 on diagonals
corr1[1:npred*p, 1:npred*p] = (1-y)*z # class 1
diag(corr1) = 0
# covariance structures
sigma0 <- diag(npred) + corr0 # matrix: cov matrix of class 0
dsig <- diag(c(rep(y, npred))) # matrix: difference in variances between classes
sigma1 <- diag(npred) - dsig + corr1 # matrix: cov matrix of class 1
return(list("sigma0" = sigma0,
"sigma1" = sigma1))
}
calculate delta mu
get_delta_mu <- function(){
# based equation A2 in Appendix A
inn = scenario() # get covariance matrices
s0 = inn[[1]]
s1 = inn[[2]]
A = inv(s0 + s1) # calculate matrix A
Y = sum(colSums(A))
dmu = qnorm(auc) / sqrt(Y)
return(dmu)
}
get_delta_mu() # 0.6043
## [1] 1.58908
Simulate data for diseased class and non-diseased class using a covariance matrix and mean vector using the MASS package. The covariance matrix is chosen such that the two predictors are correlated. The mean vector is chosen such that the diseased class has a higher mean than the non-diseased class for both predictors. The data is then combined into a single dataset.
# mean structures
mu0 <- c(rep(0,npred)) # vector: class 0 means
mu1 <- mu0 + get_delta_mu() # vector: class 1 means
# covariance structures
inn = scenario() # get covariance matrices
sigma1 <- inn[[2]] # class 1
sigma0 <- inn[[1]] # class 0
# Generate data for diseased class
diseased_data <- mvrnorm(n, mu = mu1, Sigma = sigma1)
# Generate data for non-diseased class
non_diseased_data <- mvrnorm(n, mu = mu0, Sigma = sigma0)
# Combine the diseased and non-diseased data
combined_data <- rbind(diseased_data, non_diseased_data)
# Create data frame with class labels
data <- data.frame(combined_data, class = c(rep(1, n), rep(0, n)))
# Rename columns to "predictor1", "predictor2", ..., "class"
colnames(data) <- c(paste0("predictor", 1:(ncol(combined_data))), "class")
# Plot for Predictor 1
ggplot(data, aes(x = predictor1, fill = as.factor(class))) +
geom_histogram(alpha = 0.5, position = 'identity', bins = 30) +
ggtitle("Distribution of Diseased and Non-Diseased Classes for Predictor 1")
# Plot for Predictor 2
ggplot(data, aes(x = predictor2, fill = as.factor(class))) +
geom_histogram(alpha = 0.5, position = 'identity', bins = 30) +
ggtitle("Distribution of Diseased and Non-Diseased Classes for Predictor 2")
# Calculate densities for class 1 and class 0
density1 <- with(data[data$class == 1, ], kde2d(predictor1, predictor2, n = 30))
density0 <- with(data[data$class == 0, ], kde2d(predictor1, predictor2, n = 30))
# Create the interactive 3D plot
p1 <- plot_ly(z = ~density1$z) %>%
add_surface(
x = ~density1$x,
y = ~density1$y,
colorscale = list(c(0, "red"), list(1, "pink")),
opacity = 0.8
)
p2 <- plot_ly(z = ~density0$z) %>%
add_surface(
x = ~density0$x,
y = ~density0$y,
colorscale = list(c(0, "blue"), list(1, "lightblue")),
opacity = 0.8
)
p <- subplot(p1, p2, nrows = 1, shareX = TRUE, shareY = TRUE)
p
# Run logistic regression and calculate AUC for n_iter iterations
for(i in 1:n_iter) {
# Sample a subset of the data
sample_index <- sample(1:nrow(data), sample_ratio * nrow(data))
sample_data <- data[sample_index, ]
# Run logistic regression with two predictors
model <- glm(class ~ ., data = sample_data, family = binomial)
# Predict on the same sample set
predictions <- predict(model, newdata = sample_data, type = "response")
# Calculate AUC
auc <- colAUC(predictions, sample_data$class, plotROC = FALSE)
auc_values[i] <- auc # Directly store the AUC value
# Save the model predictors and coefficients to calculate average coefficients
if(i == 1) {
coefficients <- model$coefficients
} else {
coefficients <- coefficients + model$coefficients
}
}
# Calculate average AUC
average_auc <- mean(auc_values)
print(paste("Average AUC: ", round(average_auc, 2)))
## [1] "Average AUC: 0.95"
# Calculate average coefficients
average_coefficients <- coefficients / n_iter
print(paste("Average coefficients: ", round(average_coefficients, 2)))
## [1] "Average coefficients: -2.71" "Average coefficients: 1.11"
## [3] "Average coefficients: 1.16" "Average coefficients: 1.16"